dat %>%
ggplot(aes(year, lakeid, fill=is.na(daynum))) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~metric, nrow=6)
Remaining data issues:
Predict DOC daynum from other variable daynum’s in each northern lake
no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>%
select(-sampledate) %>%
pivot_wider(names_from = metric, values_from = daynum) %>%
filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>% ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start: AIC=1761.44
## doc_epiMax ~ 1
##
## Df Sum of Sq RSS AIC
## + lakeid 6 74137 395186 1733.7
## + stratoff 1 15998 453325 1755.4
## <none> 469323 1761.4
## + secchi_max 1 1334 467989 1762.8
## + iceon 1 565 468758 1763.2
## + straton 1 551 468772 1763.2
## + energy 1 305 469018 1763.3
## + anoxia_summer 1 234 469089 1763.3
## + stability 1 65 469258 1763.4
##
## Step: AIC=1733.72
## doc_epiMax ~ lakeid
##
## Df Sum of Sq RSS AIC
## + stratoff 1 6524 388662 1731.9
## <none> 395186 1733.7
## + energy 1 2484 392702 1734.3
## + anoxia_summer 1 2400 392786 1734.3
## + iceon 1 595 394591 1735.4
## + straton 1 121 395065 1735.7
## + secchi_max 1 60 395126 1735.7
## + stability 1 56 395130 1735.7
## - lakeid 6 74137 469323 1761.4
##
## Step: AIC=1731.88
## doc_epiMax ~ lakeid + stratoff
##
## Df Sum of Sq RSS AIC
## <none> 388662 1731.9
## + energy 1 3261 385401 1731.9
## + anoxia_summer 1 2560 386102 1732.3
## + iceon 1 407 388255 1733.6
## - stratoff 1 6524 395186 1733.7
## + straton 1 202 388460 1733.8
## + stability 1 183 388479 1733.8
## + secchi_max 1 21 388640 1733.9
## + stratoff:lakeid 6 11160 377502 1737.2
## - lakeid 6 64663 453325 1755.4
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
stability+anoxia_summer+
secchi_max)*lakeid,
data=n_lakes_wide,
na.action = na.exclude)
summary(doc_model)
##
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy +
## stability + anoxia_summer + secchi_max) * lakeid, data = n_lakes_wide,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -150.321 -17.502 3.039 23.126 119.066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.83950 170.68959 0.380 0.7045
## iceon 0.22703 0.36262 0.626 0.5321
## straton 0.15476 0.36054 0.429 0.6683
## stratoff 0.51902 0.31510 1.647 0.1013
## energy -0.17673 0.22397 -0.789 0.4311
## stability -0.12534 0.14804 -0.847 0.3983
## anoxia_summer -0.03987 0.17586 -0.227 0.8209
## secchi_max -0.07954 0.08274 -0.961 0.3376
## lakeid.L 319.71870 449.12342 0.712 0.4775
## lakeid.Q 395.98906 440.10607 0.900 0.3695
## lakeid.C 235.35696 455.90738 0.516 0.6063
## lakeid^4 87.51056 456.38546 0.192 0.8482
## lakeid^5 -416.46790 462.59187 -0.900 0.3692
## lakeid^6 352.77072 445.11973 0.793 0.4291
## iceon:lakeid.L -1.09045 0.95880 -1.137 0.2569
## iceon:lakeid.Q -0.53411 0.94683 -0.564 0.5734
## iceon:lakeid.C -0.47248 0.95370 -0.495 0.6209
## iceon:lakeid^4 0.09223 0.99242 0.093 0.9261
## iceon:lakeid^5 0.44611 0.94858 0.470 0.6387
## iceon:lakeid^6 -0.10044 0.95529 -0.105 0.9164
## straton:lakeid.L -0.99644 0.96543 -1.032 0.3034
## straton:lakeid.Q 0.14198 0.87945 0.161 0.8719
## straton:lakeid.C 0.21534 0.97792 0.220 0.8260
## straton:lakeid^4 0.79795 1.02354 0.780 0.4367
## straton:lakeid^5 1.52519 0.99024 1.540 0.1253
## straton:lakeid^6 0.71254 0.87728 0.812 0.4177
## stratoff:lakeid.L -0.22517 0.84120 -0.268 0.7893
## stratoff:lakeid.Q -0.50789 0.75106 -0.676 0.4998
## stratoff:lakeid.C 0.17399 0.84436 0.206 0.8370
## stratoff:lakeid^4 -0.26699 0.93880 -0.284 0.7764
## stratoff:lakeid^5 0.87622 0.84751 1.034 0.3026
## stratoff:lakeid^6 -1.57643 0.76545 -2.059 0.0409 *
## energy:lakeid.L 0.35302 0.59611 0.592 0.5545
## energy:lakeid.Q -0.39372 0.55165 -0.714 0.4763
## energy:lakeid.C -0.19354 0.59663 -0.324 0.7460
## energy:lakeid^4 -0.13369 0.64901 -0.206 0.8370
## energy:lakeid^5 -0.43207 0.59716 -0.724 0.4703
## energy:lakeid^6 -0.02438 0.55976 -0.044 0.9653
## stability:lakeid.L -0.07321 0.34484 -0.212 0.8321
## stability:lakeid.Q 0.17086 0.37799 0.452 0.6518
## stability:lakeid.C -0.11656 0.39559 -0.295 0.7686
## stability:lakeid^4 -0.39170 0.34527 -1.134 0.2581
## stability:lakeid^5 -0.14418 0.44054 -0.327 0.7438
## stability:lakeid^6 -0.13469 0.43468 -0.310 0.7570
## anoxia_summer:lakeid.L 0.29492 0.47554 0.620 0.5359
## anoxia_summer:lakeid.Q 0.16199 0.46703 0.347 0.7291
## anoxia_summer:lakeid.C -0.07344 0.45279 -0.162 0.8713
## anoxia_summer:lakeid^4 -0.19354 0.50427 -0.384 0.7016
## anoxia_summer:lakeid^5 -0.16585 0.42883 -0.387 0.6994
## anoxia_summer:lakeid^6 0.05221 0.45990 0.114 0.9097
## secchi_max:lakeid.L 0.50747 0.23435 2.165 0.0317 *
## secchi_max:lakeid.Q -0.42967 0.23513 -1.827 0.0693 .
## secchi_max:lakeid.C -0.17325 0.22503 -0.770 0.4424
## secchi_max:lakeid^4 -0.08336 0.19516 -0.427 0.6698
## secchi_max:lakeid^5 -0.14772 0.21531 -0.686 0.4935
## secchi_max:lakeid^6 0.15293 0.20545 0.744 0.4576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.84 on 180 degrees of freedom
## (37 observations deleted due to missingness)
## Multiple R-squared: 0.2977, Adjusted R-squared: 0.08308
## F-statistic: 1.387 on 55 and 180 DF, p-value: 0.05718
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)
cor = n_lakes_wide %>%
group_by(lakeid) %>%
summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))
n_lakes_wide %>%
ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
geom_point()+
theme_bw() +
labs(color="lakeid")+
geom_abline(slope=1, intercept = 0) +
facet_wrap(~lakeid) +
geom_text(aes(label=r), data=cor, x=300, y=0) +
labs(x="obs peak DOC (daynum)", y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).
# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))
missing_doc$doc_fill = round(predict(doc_model, missing_doc))
doc_fill = missing_doc %>%
select(lakeid, year, doc_fill) %>%
rename(daynum_fill = doc_fill) %>%
mutate(metric = "doc_epiMax") %>%
filter(year < 2000)
all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>%
mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid,
levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"),
ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_max","secchi_min", "zoopDensity","zoopDensity_CC",
"doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin",
"anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
levels = rev(vars),
ordered=T)
dat_filled %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
## Additional trimming/filling
Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric
df_yearsWant = dat_filled %>%
filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
(!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))
fi_icefill = df_yearsWant %>%
filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>%
mutate(lakeid = "FI") %>%
select(-daynum, -filled_flag, -sampledate) %>%
rename(icefill = daynum_fill)
df_yearsWant = df_yearsWant %>%
full_join(fi_icefill) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill),
icefill, daynum),
filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill),
TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
all_lys = df_yearsWant %>%
select(lakeid, year) %>%
distinct()
metric = unique(df_yearsWant$metric)
all_lys_want = expand_grid(all_lys, metric)
dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>%
group_by(lakeid, metric) %>%
summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>%
left_join(medians) %>%
mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>%
mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>%
select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%
ggplot(aes(year, metric, fill=filled_flag)) +
geom_tile(color="grey") +
theme_bw() +
facet_wrap(~lakeid, nrow=6)
write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")